library(Seurat)
library(dplyr)
library(Rtsne)
library(RColorBrewer)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(cowplot)
library(ggdendro)
gnames<- modplots::gnames
# here we need to load all the datasets that will be part of our analysis
my.samples <- list()
my.samples[[1]] <- readRDS("~/spinal_cord_paper/data/poly1_filtered_060223.rds")
#The sample name, will be used in plots and files
my.samplename <- "Gg_poly_1"
Date: 07.02.23
This script takes filtered seurat objects from ddqc and then runs our pipeline using Seurat and other packages to analyse the cells.
First, we build the Seurat object.
# are there several datasets?
my.multi <- F
my.samplenames <- c()
# put the datasets togheter, if there are many.
if (length(my.samples) > 1) {
my.multi <- T
my.se = merge(my.samples[[1]], my.samples[[2]], add.cells.ids = my.samplenames)
} else {my.se = my.samples[[1]]}
#clean
rm(my.samples)
rm(my.samplenames)
Now, we do some normalization steps, and we can see what’s the amount of UMIs, genes and percentage of MT reads
# Normalize the data. Lognorm
my.se <- NormalizeData(
my.se,
normalization.method = "LogNormalize",
scale.factor = 10000
)
# Scale the datas
all.genes <- rownames(my.se)
my.se <- ScaleData(my.se, features = all.genes)
# How does the data look like?
VlnPlot(my.se,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3,
group.by = "orig.ident")
# Regress variability caused by differences in number of UMIs and percentage of MT reads
my.se <- SCTransform(
my.se,
vars.to.regress = c("percent.mt", "nCount_RNA"),
verbose = FALSE,
return.only.var.genes = F
)
Now we can score the cell cycle stage, using Seurats function. For this wee need the ortholog of the distributed cell stage marker lists.
We add the scores of stages S and G2M, as well as the difference between them to the metadata with the names: S, G2M, CC.Difference
cell cycle scoring
# Load the orhtology table
ortho_gg_mm_v102 <- readRDS("~/spinal_cord_paper/data/ortho_gg_mm_v102.rds")
colnames(ortho_gg_mm_v102) <- c(
"GG_gene_ID",
"GG_gene_Name",
"MM_gene_ID",
"MM_gene_Name",
"ortho_conf",
"homolog_type"
)
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- ortho_gg_mm_v102 %>%
dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
dplyr::filter(MM_gene_Name %in% cc.genes$s.genes) %>%
dplyr::arrange(match(MM_gene_Name, cc.genes$s.genes)) %>%
dplyr::pull(GG_gene_ID)
g2m.genes <- ortho_gg_mm_v102 %>%
dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
dplyr::filter(MM_gene_Name %in% cc.genes$g2m.genes) %>%
dplyr::arrange(match(MM_gene_Name, cc.genes$g2m.genes)) %>%
dplyr::pull(GG_gene_ID)
t0 <- Sys.time()
my.se <- CellCycleScoring(
my.se,
s.features = s.genes,
g2m.features = g2m.genes,
set.ident = TRUE
)
paste0("Seurats CC scoring took ", Sys.time() - t0, " seconds to run.")
my.se$CC.Difference.seurat <- my.se$S.Score - my.se$G2M.Score
# view cell cycle scores and phase assignments
head(my.se[[]])
rm(t0, s.genes, g2m.genes)
cc_marker <- ortho_gg_mm_v102 %>%
dplyr::filter(GG_gene_Name %in% c("PCNA", "TOP2A", "MCM6")) %>%
dplyr::arrange(match(GG_gene_Name, c("PCNA", "TOP2A", "MCM6")))
# Visualize the distribution of cell cycle markers across
Rplot <- RidgePlot(my.se, features = cc_marker$GG_gene_ID, ncol = 2, combine = FALSE)
Rplot[[1]] + ggtitle("PCNA") +
Rplot[[2]] + ggtitle("TOP2A") +
Rplot[[3]] + ggtitle("MCM6") +
patchwork::plot_layout(guides = "collect",
nrow = 2)
rm(ortho_gg_mm_v102, cc_marker)
cc1 <- ggplot(my.se[[]]) +
geom_point(aes(
x = S.Score,
y = G2M.Score,
color = Phase
)) +
theme_cowplot() +
ggtitle("Seurat CC scores") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(color = "Seurat Phase")
cc3 <- ggplot(my.se[[]]) +
geom_point(aes(
x = S.Score,
y = G2M.Score,
color = CC.Difference.seurat
)) +
geom_abline(slope = 1, intercept = 0, lty = "dashed") +
scale_color_gradient2(
low = "blue",
mid = "grey",
high = "red",
midpoint = 0
) +
theme_cowplot() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(color = "Seurat CC.Diff")
cc1 + cc3 +
plot_layout(
guides = "collect",
nrow = 1
)
rm(cc1, cc3, Rplot)
We do a linear dimensionality reduction, which is the most important step in the whole analysis.
First, we see how a PCA from the untransformed data (nothing regressed) looks like.
Then, we regress UMI count, MT percent, CC.Difference, and the batches (if any). This will also chosse variable genes, based on a dispersion threshold.
We run our definite PCA and take a look again. Now, we check how many dimensions we should take into account, this requires manual inspection
# Run the bare PCA
my.se <- RunPCA(
my.se,
assay = "RNA",
features = all.genes,
npcs = 2,
verbose = F,
group.by = "Phase"
)
PCAPlot(my.se)
# Regress variables we have calculated
if (my.multi) {
my.se <- SCTransform(
my.se,
vars.to.regress = c("percent.mt", "nCount_RNA", "CC.Difference.seurat", "orig.ident"),
verbose = FALSE,
variable.features.rv.th = 1.3
)
} else {my.se <- SCTransform(
my.se,
vars.to.regress = c("percent.mt", "nCount_RNA", "CC.Difference.seurat"),
verbose = FALSE,
variable.features.rv.th = 1.3
)
}
# Run the actual PCA (by default uses var.features of assay)
my.se <- RunPCA(my.se, assay = "SCT", verbose = F)
PCAPlot(my.se, group.by = "Phase")
# Which dimensions will we choose?
hist(my.se@reductions$pca@stdev^2, breaks = 500)
# Regress variables we have calculated, except the CC.Difference this time
if (my.multi) {
my.se <- SCTransform(
my.se,
vars.to.regress = c("percent.mt", "nCount_RNA", "orig.ident"),
verbose = FALSE,
variable.features.rv.th = 1.3
)
} else {my.se <- SCTransform(
my.se,
vars.to.regress = c("percent.mt", "nCount_RNA"),
verbose = FALSE,
variable.features.rv.th = 1.3
)
}
rm(all.genes)
Here we need to manually input how the dimensions we will use. Ca. 20 are usual for our datasets.
my.dimensions=1:17
We will run the tSNE. Using the FFT-accelerated Interpolation-based t-SNE (FIt-SNE). We run two, a normal tSNE, and an exaggerated tSNE, where clusters are tighter togheter. This is located under the xtsne name.
# Get the tSNE function
source("~/Software/FIt-SNE-master/fast_tsne.R")
my.tsne = fftRtsne(my.se@reductions$pca@cell.embeddings[,my.dimensions],
max_iter = 1000,
learning_rate = round(dim(my.se)[2]/12),
initialization =(my.se@reductions$pca@cell.embeddings[,1:2]/my.se@reductions$pca@stdev[1])*0.0001,
perplexity_list = c(30, round(dim(my.se)[2]/100)),
fast_tsne_path="~/Software/FIt-SNE-master/bin/fast_tsne")
colnames(my.tsne) = c("tsne_1", "tsne_2")
rownames(my.tsne) = colnames(my.se)
my.se[["tsne"]] = CreateDimReducObject(embeddings = my.tsne, key = "tsne_", assay = DefaultAssay(my.se), global = T)
rm(my.tsne)
Here we perform the Louvain-jaccard clustering implemented in Seurat. We can see the tree of clusters, to see how the clusters relate in the PCA space.
We also use the tree, to check if any of the terminal pairs of sisters should be merged. This is determined based on a minimum of 20 DEs between the clusters.
# Find first the nearest neighbors
my.se <- FindNeighbors(object = my.se, dims=my.dimensions, verbose = F)
# Then the actual clusters
my.se <- FindClusters(object = my.se, resolution = 1.2, verbose = F, random.seed = 42)
# Check the tree of clusters, to see what's the relationship between them
my.se <- BuildClusterTree(my.se, dims = my.dimensions, verbose = F)
plot(Tool(object = my.se, slot = 'BuildClusterTree'))
# We are gonna check for DEs using the non integrated data. We are only gonna test genes that have variability, so we calculate variable genes
my.se <- FindVariableFeatures(my.se, assay = "RNA")
# Only the genes with variability > median
my.HVF <- HVFInfo(my.se, assay = "RNA")
my.HVF <- rownames(my.HVF)[which(my.HVF[, 3] > (median(my.HVF[, 3])))]
# We check for pairs of clusters, how mayn DEs they have. If less than n, we merge them
keep.check <- T
while (keep.check == T) {
keep.check <- F
# Check the tree of clusters, to see what's the relationship between them
my.se <-
BuildClusterTree(my.se, dims = my.dimensions, verbose = F)
# Check only the terminal sisters
to.check = ips::terminalSisters(my.se@tools$BuildClusterTree)
for (i in to.check) {
# DE between the sisters
my.DE <-
FindMarkers(
my.se,
i[1],
i[2],
test.use = "MAST",
latent.vars = c("CC.Difference.seurat"),
min.pct = 0.25,
verbose = T,
assay = "RNA",
features = my.HVF,
) %>%
dplyr::filter(abs(avg_log2FC) > 0.5) %>%
dplyr::filter(p_val_adj < 0.05)
# If less than 5, merge, and repeat
if (dim(my.DE)[1] < 5) {
cat(
paste0(
dim(my.DE)[1],
" genes differentially expressed between clusters ",
i[1],
" and ",
i[2],
" merging \n"
)
)
my.se <-
SetIdent(my.se,
cells = WhichCells(my.se, idents = i[2]),
value = i[1])
keep.check <- T
}
print(i)
}
}
rm(to.check, my.HVF, my.DE)
# renumber starting from 1
my.ID <- factor(
Idents(my.se),
levels= levels(Idents(my.se))[base::order(as.numeric(levels(Idents(my.se))))])
levels(my.ID) <- 1:length(levels(my.ID))
Idents(my.se) <- my.ID
my.se[["seurat_clusters"]] <- my.ID
# Check again the clusters
dim1 <- DimPlot(my.se, reduction = "tsne", cols = rainbow(length(levels(my.ID))), label = T, label.size = 5, pt.size = 1.5)
dim1
Plot the tree again
# tree based on PCA dims
my.se <- BuildClusterTree(my.se, dims = my.dimensions, verbose = F)
plot(Tool(object = my.se, slot = 'BuildClusterTree'))
modplots::mFeaturePlot(my.se, my.features = c("OLIG2", "SOX9", "TUBB3", "SHH", "SLC18A3", "GATA2", "PAX2", "TLX3", "IGFBP7", "HBBA", "IFI30", "MSX2", "PLP1"), gnames = gnames, size = 0.5)
Now that we have the final reductions, we’ll choose one and we look at the statistics of the cells.
# set and get dim.reduct embeddings
my.reduc <- "tsne"
emb <- data.frame(Embeddings(my.se, my.reduc))
colnames(emb) <- c("reduc_1", "reduc_2")
meta <- my.se[[]] %>%
tibble::rownames_to_column("cell_ID")
my.plots = list()
my.plots[[1]] = ggplot(emb[meta[order(meta$nCount_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nCount_RNA)), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="UMI count")
my.plots[[2]] = ggplot(emb[meta[order(meta$nFeature_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nFeature_RNA)), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="gene count")
# log1p because of a few cells with high mt percent
my.plots[[3]] = ggplot(emb[meta[order(meta$percent.mt),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=(sort(meta$percent.mt))), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="MT percent")
my.plots[[4]] = ggplot(emb[meta[order(meta$CC.Difference.seurat),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$CC.Difference.seurat)), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="Cell Cycle\nS-G2M")
my.plots[[5]] = ggplot(emb[meta[order(meta$percent.rb),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$percent.rb)), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="percent.rb")
grid.arrange(grobs=my.plots, ncol=2)
rm(my.plots, emb, meta)
The MT percent plot is log1p scaled since there were a few outlier cells with up to 40 percent of mitochondrial fractions. They are not filtered out because of their high UMI (> median(UMI)) content.
To identify the different DV domains of the neuron and progenitor clusters, we plot their specific markers.
neurons <- list(dI1 = c("LHX2","LHX9","BARHL1","BARHL2","POU4F1"),
dI2 = c("LHX1","LHX5","POU4F1"),
dI3 = c("ISL1","TLX3","DRGX","POU4F1"),
dI4 = c("LBX1","PAX2","LHX1","LHX5"),
dI5 = c("LBX1","LMX1B","TLX3","DRGX","POU4F1"),
dI6 = c("LBX1","PAX2","LHX1","LHX5"),
V0 = c("EVX1","PAX2","LHX1","LHX5"),
V1 = c("EN1","PAX2","LHX1","LHX5"),
V2a = c("VSX1","SOX14","LHX3"),
V2b = c("GATA2","GATA3","TAL1"),
MN = c("MNX1","ISL1","LHX3","ISL2", "SLC18A3"))
prog <- list(dp1_3 = c("PAX6","IRX3","IRX5","MSX1","PAX3","PAX7"),
dp4 = c("PAX6","IRX3","IRX5","GSX1","PAX3","PAX7"),
dp5 = c("PAX6","IRX3","IRX5","DBX2","GSX1","PAX3","PAX7"),
dp6 = c("PAX6","IRX3","IRX5","DBX2","LEUTX","PAX3","PAX7"),
p0 = c("PAX6","IRX3","IRX5","DBX2","LEUTX"),
p1 = c("PAX6","IRX3","IRX5","DBX2","PRDM12"),
p2 = c("PAX6","IRX3","IRX5","FOXN4","NKX6-1"),
pMN = c("PAX6","OLIG2","NKX6-1"),
p3 = c("NKX2-8","NKX2-2","NKX6-1"))
# allows to use mFeaturePlot with lapply
feat_list_plot <- function(x) {
plot <- modplots::mFeaturePlot(my.se, my.features = x,
gnames = gnames, size = 0.2, return = TRUE)
return(plot)
}
tsne_dim <- TSNEPlot(
my.se,
reduction = "tsne",
cols = rainbow(length(levels(my.ID))),
pt.size = 0.01,
label.size = 1,
label = TRUE
) +
ggplot2::theme(legend.position = "none")
plots_prog <- lapply(prog, feat_list_plot)
plots_neur <- lapply(neurons, feat_list_plot)
Not all features are present in your object! Removing: EN1
Not all features are present in your object! Removing: ISL2
pdf(paste0("~/spinal_cord_paper/figures/DV_prog_domain_",my.samplename ,".pdf"), width = 7, height = 5)
for (j in names(prog)) {
plots_prog[[j]][["tsne"]] <- tsne_dim
gridExtra::grid.arrange(grobs = plots_prog[[j]], ncol = 3, top = j)
}
dev.off()
png
2
pdf(paste0("~/spinal_cord_paper/figures/DV_neur_domain_",my.samplename ,".pdf"), width = 7, height = 5)
for (j in names(neurons)) {
plots_neur[[j]][["tsne"]] <- tsne_dim
gridExtra::grid.arrange(grobs = plots_neur[[j]], ncol = 3, top = j)
}
dev.off()
png
2
Here we do the differential expression analysis, and end up with the marker genes lists. We can also see the marker gene dot plot, for the top 2 marker genes per cluster
# Find all the marker genes, with these thresholds MAST
semarkers = FindAllMarkers(my.se,
features = my.se[["SCT"]]@var.features,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.5,
latent.vars = c("CC.Difference.seurat"),
test.use = "MAST",
assay = "RNA",
return.thresh = 0.05)
# We only keep the significant ones
semarkers <- semarkers %>%
filter(p_val_adj < 0.05) %>%
rename(Gene.stable.ID = gene) %>%
left_join(gnames, by = "Gene.stable.ID")
# Take only the top 10
semrk10 = semarkers %>% group_by(cluster) %>% top_n(-10, p_val_adj)
semrk1 = semarkers %>% group_by(cluster) %>% top_n(-1, p_val_adj)
modplots::mDotPlot2(my.se,
features = unique(semrk1$Gene.stable.ID),
cols = c("grey", "black"),
gnames = gnames, dot.scale = 6) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
DoHeatmap(my.se, semrk1$Gene.stable.ID)
saveRDS(my.se, paste0("~/spinal_cord_paper/data/",my.samplename,"_seurat_",format(Sys.Date(), "%d%m%y"),".rds"))
write.table(semarkers, sep = "\t", row.names = T, col.names = T,
file = paste0("~/spinal_cord_paper/data/",my.samplename,"_fullDE_",format(Sys.Date(), "%d%m%y"),".txt"), quote = F)
write.table(semrk10, sep = "\t", row.names = T, col.names = T,
file = paste0("~/spinal_cord_paper/data/",my.samplename,"_top10DE_",format(Sys.Date(), "%d%m%y"),".txt"), quote = F)
# Date and time of Rendering
Sys.time()
[1] "2023-02-07 12:08:36 CET"
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /scicore/soft/apps/OpenBLAS/0.3.1-GCC-7.3.0-2.30/lib/libopenblas_sandybridgep-r0.3.1.so
locale:
[1] en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggdendro_0.1.22 cowplot_1.1.1 patchwork_1.1.1 gridExtra_2.3
[5] ggplot2_3.3.3 tidyr_1.1.3 RColorBrewer_1.1-2 Rtsne_0.15
[9] dplyr_1.0.10 SeuratObject_4.0.2 Seurat_4.0.5
loaded via a namespace (and not attached):
[1] fastmatch_1.1-0 plyr_1.8.6
[3] igraph_1.2.6 lazyeval_0.2.2
[5] sp_1.4-5 splines_4.1.0
[7] listenv_0.8.0 scattermore_0.7
[9] GenomeInfoDb_1.28.0 digest_0.6.27
[11] htmltools_0.5.1.1 ips_0.0.11
[13] fansi_0.5.0 magrittr_2.0.1
[15] memoise_2.0.0 tensor_1.5
[17] cluster_2.1.2 ROCR_1.0-11
[19] globals_0.16.2 Biostrings_2.60.0
[21] matrixStats_0.58.0 modplots_1.0.0
[23] spatstat.sparse_3.0-0 prettyunits_1.1.1
[25] colorspace_2.0-1 blob_1.2.1
[27] ggrepel_0.9.1 xfun_0.34
[29] crayon_1.4.1 RCurl_1.98-1.3
[31] jsonlite_1.7.2 spatstat.data_3.0-0
[33] phangorn_2.7.0 ape_5.5
[35] survival_3.2-11 zoo_1.8-9
[37] glue_1.6.2 polyclip_1.10-0
[39] gtable_0.3.0 zlibbioc_1.38.0
[41] XVector_0.32.0 leiden_0.3.9
[43] DelayedArray_0.18.0 SingleCellExperiment_1.14.1
[45] future.apply_1.7.0 BiocGenerics_0.38.0
[47] abind_1.4-5 scales_1.1.1
[49] pheatmap_1.0.12 DBI_1.1.1
[51] miniUI_0.1.1.1 Rcpp_1.0.7
[53] progress_1.2.2 viridisLite_0.4.0
[55] xtable_1.8-4 reticulate_1.22
[57] spatstat.core_2.1-2 bit_4.0.4
[59] stats4_4.1.0 htmlwidgets_1.5.3
[61] httr_1.4.2 ellipsis_0.3.2
[63] ica_1.0-2 XML_3.99-0.6
[65] farver_2.1.0 pkgconfig_2.0.3
[67] sass_0.4.0 uwot_0.1.10
[69] deldir_1.0-6 utf8_1.2.1
[71] labeling_0.4.2 tidyselect_1.2.0
[73] rlang_1.0.6 reshape2_1.4.4
[75] later_1.2.0 AnnotationDbi_1.54.0
[77] munsell_0.5.0 tools_4.1.0
[79] cachem_1.0.5 cli_3.4.1
[81] generics_0.1.3 RSQLite_2.2.7
[83] ggridges_0.5.3 org.Gg.eg.db_3.13.0
[85] evaluate_0.20 stringr_1.4.0
[87] fastmap_1.1.0 yaml_2.2.1
[89] goftest_1.2-2 knitr_1.41
[91] bit64_4.0.5 fitdistrplus_1.1-6
[93] purrr_0.3.4 RANN_2.6.1
[95] KEGGREST_1.32.0 pbapply_1.4-3
[97] future_1.30.0 nlme_3.1-152
[99] mime_0.10 compiler_4.1.0
[101] plotly_4.10.0 png_0.1-7
[103] spatstat.utils_3.0-1 tibble_3.1.8
[105] bslib_0.2.5.1 stringi_1.6.2
[107] highr_0.9 lattice_0.20-44
[109] Matrix_1.3-3 vctrs_0.5.1
[111] pillar_1.8.1 lifecycle_1.0.3
[113] spatstat.geom_3.0-3 lmtest_0.9-38
[115] jquerylib_0.1.4 RcppAnnoy_0.0.19
[117] data.table_1.14.0 bitops_1.0-7
[119] irlba_2.3.3 GenomicRanges_1.44.0
[121] httpuv_1.6.1 R6_2.5.0
[123] promises_1.2.0.1 KernSmooth_2.23-20
[125] IRanges_2.26.0 parallelly_1.33.0
[127] codetools_0.2-18 MASS_7.3-54
[129] assertthat_0.2.1 MAST_1.18.0
[131] SummarizedExperiment_1.22.0 withr_2.4.2
[133] sctransform_0.3.3 S4Vectors_0.30.0
[135] GenomeInfoDbData_1.2.6 hms_1.1.0
[137] mgcv_1.8-35 parallel_4.1.0
[139] quadprog_1.5-8 grid_4.1.0
[141] rpart_4.1-15 rmarkdown_2.17
[143] MatrixGenerics_1.4.0 Biobase_2.52.0
[145] shiny_1.6.0